#Set your working directory
setwd("C:\\Users\\Aaron\\Documents\\Gov 2001\\Replication Paper\\Rep_Files")

set.seed(152342)

#Code to produce Figure 1 and associated output
d <- read.delim("Primary_Files/JournalArticles_ReplicationFiles.txt")
d <- d[!is.na(d$Year.Published),]
names(d)
names(d)[5:8] <- c("StateAv","RepRef","RepAv","UpReq")

d$StateAv
table(d$StateAv, d$RepAv)

#Proportion who state that replication files are available:
#"We found that 55% of publications employing statistical analysis stated that replication files were available at a website, though we were only able to find replication files for 38% of publications."
sum(d$StateAv=="Yes" & d$UpReq=="No")/length(d$StateAv)
#.55

sum(d$RepAv=="Yes")/length(d$StateAv)
#.38

#"4% of publications state that replication files are available upon request."
sum(d$UpReq=="Yes")/length(d$StateAv)
#0.04

#Proportion State Available by Journal
sum(d$StateAv=="Yes" & d$UpReq=="No" & d$Journal=="AJPS")/sum(d$Journal=="AJPS")
#.74
sum(d$StateAv=="Yes" & d$UpReq=="No" & d$Journal=="APSR")/sum(d$Journal=="APSR")
#.16

#Proportion Is Available by Journal
sum(d$RepAv=="Yes"  & d$Journal=="AJPS")/sum(d$Journal=="AJPS")
#.51
sum(d$RepAv=="Yes"  & d$Journal=="APSR")/sum(d$Journal=="APSR")
#.13



library("ggplot2")
library(colorspace)

d$y <- rep(NA, length(d[,1]))
d$y[d$StateAv=="Yes"] <- 1 
d$y[d$StateAv=="No"] <- 0

d$y2 <- rep(NA, length(d[,1]))
d$y2[d$RepAv=="Yes"] <- 1 
d$y2[d$RepAv=="No"] <- 0

d$x <- d$Year.Published
d$xj <- d$x + runif(length(d$x),-0.3,0.3)
d$yj <- d$y + runif(length(d$x),-0.03,0.03)
#d$yj <- 0.96*d$y + runif(length(d$x),-0.03,0.03)
d$y2j <- d$y2 + runif(length(d$x),-0.03,0.03)

height <- 6
#y <- y + rnorm(length(d[,1]),0,.1)
p1 <- ggplot(d, aes(y=y, x=as.numeric(Year.Published))) + facet_grid(. ~ Journal) +
  ylab("Publication States that Replication Files Are Available") +  
  xlab("Year") 
#+ ylim(0,1)
p1 + stat_smooth(method="glm", family="binomial", formula= y ~ poly(x,3))  + 
  geom_point(aes(y=d$yj, x=d$xj)) +
  #   theme(axis.ticks=element_blank(), panel.grid.minor.y=element_blank()) +
  #   scale_y_continuous(breaks=c(0, 1), labels=c("No", "Yes"))
  ggsave("Output/Fig1A.pdf", width=1.6*height, height=height)

p2 <- ggplot(d, aes(y=y2, x=as.numeric(Year.Published))) + facet_grid(. ~ Journal) +
  ylab("Replication Files Are Publicly Available") +  
  xlab("Year") 
#+ ylim(0,1)
p2 + stat_smooth(method="glm", family="binomial", formula= y ~ poly(x,3)) +
  geom_point(aes(y=d$y2j, x=d$xj))
ggsave("Output/Fig1B.pdf", width=1.6*height, height=height)


#c <- ggplot(d, aes(y=StateAv, x=Year.Published, colour=factor(Journal)))
#c + stat_smooth(method=lm, aes(fill = factor(Journal)))


rm(list=ls(all=TRUE)) 

#Code to produce Figure 2 and 3 and associated output
###Preparing Data for Public Distribution
##Data was downloaded directly from Qualtrics
d <- read.csv("Primary_Files/2013-08-16-Replication_in_Social_Science.csv")

# v.names <- d[1,]
# names(d) <- as.character(v.names)
# d[1:3,]

#See the first row for the variable definitions
d <- d[-1,]
#d <- d[d$V1!="ResponseID",]

names(d) <- c("ResponseID","ResponseSet","Name", "ExtDataRef", "Email", "IP", "Status", "StartDate", "EndDate", "Finished",
              "Null", "Null2", "Browser", "Browser.Version", "OS", "Resolution", "Flash", "Java", "Agent", 
              "R1.1", "R1.2", "R1.3", "R1.4", "R1.5", "R1.6", "R1.6.text", "R2.1", "R2.2", "R2.3", "R2.4", "R2.5",
              "R2.6", "R2.7", "R2.8", "R3", "Journal", "Year", "Author", "Other.Info", "Contact", "X")

#Removing my test answers
d <- d[d$Contact!="ALlan Dafoe" & d$Contact!="Allan test" & d$Contact!="test" & d$R1.6.text!="allan",]

#Removing Identifying and Unnecessary Variables
#I removed variables related to the authors of the papers that were replicated so as to not make public unsubstantiated criticisms.
d <- d[, c("Name", "StartDate", "EndDate", "Finished", "R1.1", "R1.2", "R1.3", "R1.4", "R1.5", "R1.6", "R1.6.text", "R2.1", "R2.2", "R2.3", "R2.4", "R2.5",
           "R2.6", "R2.7", "R2.8", "R3", "Journal", "Year")]

write.table(d,file="Output/rep_data2.csv",sep=",",row.names=F)


###Begin Public Replication Code
d <- read.csv("Output/rep_data2.csv")



#Respondents
levels(d$Name) <- c("none", "Dafoe", "King", "none", "Polmeth")

#"Students who had taken my "Advanced Quantitative Methods" Ph.D. course in 2012 "
sum(d$Name=="Dafoe")

#"Students who had taken Gary King's "Advanced Quantitative Political Methodology" Ph.D. course"
sum(d$Name=="King")

#"Scholars responding to an email I distributed over the Political Methodology listserve "
sum(d$Name=="Polmeth")

#"responses to the survey"
sum(d$Name=="Polmeth" | d$Name=="King" | d$Name=="Dafoe")



###Cleaning data###
d$Year
d$Year[d$Year==".2007"] <- "2007"

r <- c("R1.1","R1.2", "R1.3", "R1.4", "R1.5", "R1.6")

#Filling in 0s
d[,r][is.na(d[,r])] <- 0
d[,r]

##Responses
responses.w <- c("(1) No replication materials were available online.", 
                 "(2) A limited dataset was available to perform the analysis, but no analysis code.", "(3) A limited dataset was available to perform the analysis and some analysis code.",
                 "(4) A limited dataset was available to perform the analysis and complete analysis code.", "(5) Many primary datasets were available, as well \n as code to create the final dataset from the \n primary datasets and code to analyze the data.",
                 "(6) Other: please explain")
responses.w <- gsub("perform", "perform \n", responses.w)

tot <- sum(d$R1.1==1 | d$R1.2==1 | d$R1.3==1 | d$R1.4==1 | d$R1.5==1 | d$R1.6==1)


responses.n <- c(sum(d$R1.1==1)/tot, 
                 sum(d$R1.2==1)/tot, 
                 sum(d$R1.3==1)/tot, 
                 sum(d$R1.4==1)/tot, 
                 sum(d$R1.5==1)/tot, 
                 sum(d$R1.6==1)/tot)
height <- 6
pdf('Output/Fig2.pdf', width=1.6*height, height=height)
par(mar=c(5.1, 20, 4.1,2.1))
barplot(responses.n, names.arg=responses.w, horiz=TRUE, las=1  , xlab="Proportion of Responses", col="dark blue")
dev.off()
#main="Availability of Replication Files (n=192)"

#Responses for Dafoe class
res <- d$Name=="Dafoe"
tot <- sum(d$R1.1[res]==1 | d$R1.2[res]==1 | d$R1.3[res]==1 | d$R1.4[res]==1 | d$R1.5[res]==1 | d$R1.6[res]==1)

responses.n <- c(sum(d$R1.1[res]==1)/tot, 
                 sum(d$R1.2[res]==1)/tot, 
                 sum(d$R1.3[res]==1)/tot, 
                 sum(d$R1.4[res]==1)/tot, 
                 sum(d$R1.5[res]==1)/tot, 
                 sum(d$R1.6[res]==1)/tot)



height <- 6
pdf('Output/Fig2-c1.pdf', width=1.6*height, height=height)
par(mar=c(5.1, 20, 4.1,2.1))
barplot(responses.n, names.arg=responses.w, horiz=TRUE, las=1,  main="Availability of Replication Files (Dafoe's class, n=16)", xlab="Proportion of Responses", col="dark blue")
dev.off()


#Responses for King's class
res <- d$Name=="King"

tot <- sum(d$R1.1[res]==1 | d$R1.2[res]==1 | d$R1.3[res]==1 | d$R1.4[res]==1 | d$R1.5[res]==1 | d$R1.6[res]==1)

responses.n <- c(sum(d$R1.1[res]==1)/tot, 
                 sum(d$R1.2[res]==1)/tot, 
                 sum(d$R1.3[res]==1)/tot, 
                 sum(d$R1.4[res]==1)/tot, 
                 sum(d$R1.5[res]==1)/tot, 
                 sum(d$R1.6[res]==1)/tot)



height <- 6
pdf('Output/Fig1-c2.pdf', width=1.6*height, height=height)
par(mar=c(5.1, 20, 4.1,2.1))
barplot(responses.n, names.arg=responses.w, horiz=TRUE, las=1,  main="Availability of Replication Files (King's class, n=29)", xlab="Proportion of Responses", col="dark blue")
dev.off()


#Proportion who met standard of providing data and complete analysis code, ignoring Other
#"Ignoring the "Other" category, 36% of these respondents reporte that complete data files and replication code were available"
tot <- d$R1.1==1 | d$R1.2==1 | d$R1.3==1 | d$R1.4==1 | d$R1.5==1 
com <- (d$R1.4==1 | d$R1.5==1) & d$R1.1==0 & d$R1.2==0 & d$R1.3==0
not.com <- (d$R1.1==1 | d$R1.2==1 | d$R1.3==1) & d$R1.4==0 & d$R1.5==0

sum(com)/sum(tot)
#36%


#Proportion for which some aspect of replication files were missing
#"whereas 64% reported that some important element of the replication files were missing"
sum(not.com)/sum(tot)
#64%


#Second question
#Filling in 0s
r <- c("R2.1","R2.2", "R2.3", "R2.4", "R2.5", "R2.6", "R2.7", "R2.8")
d[,r][is.na(d[,r])] <- 0
d[,r]


responses.w2 <- c("(1) I was not able to \n approximately reproduce the main results.", 
                  "(2) I was able to \n approximately reproduce the main results.", "(3) I was able to precisely reproduce the main results.",
                  "(4) I found one or more major technical errors,\n though these didn't change the main results.", "(5) I found that one or more key results \n were driven by a technical error.",
                  "(6) I found that one or more key results \n were driven by an arbitrary aspect of their analysis.", 
                  "(7) I found that one or more key results were fragile \n in a manner that would lead an impartial scholar to \n substantially discount the value of the original study.",
                  "(8) Most or all of the key results were robust.")


res <- TRUE
tot <- d$R2.1[res]==1 | d$R2.2[res]==1 | d$R2.3[res]==1 | d$R2.4[res]==1 | d$R2.5[res]==1 | d$R2.6[res]==1 | d$R2.7[res]==1 | d$R2.8[res]==1
tot <- sum(tot)

responses.n2 <- c(sum(d$R2.1[res]==1)/tot, 
                  sum(d$R2.2[res]==1)/tot, 
                  sum(d$R2.3[res]==1)/tot, 
                  sum(d$R2.4[res]==1)/tot, 
                  sum(d$R2.5[res]==1)/tot, 
                  sum(d$R2.6[res]==1)/tot,
                  sum(d$R2.7[res]==1)/tot,
                  sum(d$R2.8[res]==1)/tot)


height <- 8
pdf('Output/Fig3.pdf', width=1.6*height, height=height)
par(mar=c(5.1, 22, 4.1,2.1))
barplot(responses.n2, names.arg=responses.w2, horiz=TRUE, las=1, xlab="Proportion of Responses", col="dark blue")
dev.off()
# main="`Please describe the results of your replication efforts (select all that apply).'",


#Proportion claiming to precisely replicate results. 
#"On the one hand, of those who responded to the reproducibility of the result (responses 1-3), 52% reported that they were “able to precisely reproduce the main results” and only 13% reported that they were “not able to approximately reproduce the main results.”"
tot <- sum(d$R2.1==1 | d$R2.2==1 | d$R2.3==1) 
sum(d$R2.3==1)/tot

sum(d$R2.1==1)/tot

#Proportion of respondents reporting robust, out of all reporting about robustness.
#"Of those who responded to the robustness of the results (responses 4-8), 36% reported that “most or all of the key results were robust”, 20% that there were “major technical errors though these didn’t change the main results”, and 56% that results were not robust (responses 5-7)."
sum(d$R2.8)/sum(d$R2.4==1 | d$R2.5==1 | d$R2.6==1 | d$R2.7==1 | d$R2.8)
sum(d$R2.4==1)/sum(d$R2.4==1 | d$R2.5==1 | d$R2.6==1 | d$R2.7==1 | d$R2.8)
sum(d$R2.5==1 | d$R2.6==1 | d$R2.7==1)/sum(d$R2.4==1 | d$R2.5==1 | d$R2.6==1 | d$R2.7==1 | d$R2.8)






